##############################################
# The Archipelago Capitalism of Citizenship-by-Investment
# Jelena Dzankic, Mira Seyfettinoglu, Ayelet Shachar, Maarten Vink, Luuk van der Baaren
# Comparative Political Studies, accepted for publication, October 2025
#
# Code to prepare the main analysis dataset
# Code prepared by Maarten Vink with research assistance by Tarek Jaziri Arjona
##############################################

version #R version 4.5.1 (2025-06-13)

#start with clean workspace
rm(list=ls(all=TRUE))

#load packages
library(devtools)
library(WDI)
#devtools::install_github("vdeminstitute/vdemdata")
library(vdemdata)
#remotes::install_github("xmarquez/democracyData")
library(democracyData)
library(tidyverse)
library(haven)
library(readxl)
library(writexl)
library(naniar)
library(DataExplorer)

# make a project directory and save all files from the dataverse

# create sub-folders to organise output
# create directory if it does not already exist
ifelse(!dir.exists("01_data"),
       dir.create("01_data"),
       "Directory Exists")
ifelse(!dir.exists("02_plots_tables"),
       dir.create("02_plots_tables"),
       "Directory Exists")


# read the dataset
dat <- read_excel("cbi_data.xlsx")
summary(dat) # 11002 obs of 24 variables

#create world_region variable with labelled categories
dat$world_region <- ifelse(dat$world_region == 1, "Africa",
                           ifelse(dat$world_region == 2, "Asia",
                                  ifelse(dat$world_region == 3, "Europe",
                                         ifelse(dat$world_region == 6, "Oceania",
                                                ifelse(dat$world_region == 4 | dat$world_region == 5,
                                                       "Americas", NA)))))

# add binary variable for generic investor provisions (cbi_gen)
# note that cbi_cat == 1 ('generic') measures whether country *only* have a generic provision in place
# in practice, countries may maintain a generic provision when they introduce a CBI policy
# and going from specific to generic does not imply introducing generic
# for this reason we derive cbi_gen as follows:
dat <- dat |>
  group_by(iso3c) |>
  mutate(cbi_gen0 = ifelse(cbi_cat == 1, 1,0)) |> # variable that codes whether generic investor provision (cbi_cat == 1) in place
  mutate(cbi_gen_mean = mean(cbi_gen0)) |> # calculate the mean to see if country ever has generic investor provision
  mutate(cbi_gen = ifelse(cbi_cat == 1 | cbi_cat == 2, 1, # code 1 if generic investor provision (cbi_cat == 1) or specific investor citizenship provision without amount (cbi_cat == 2)
                          ifelse(cbi_cat == 3 & cbi_gen_mean > 0, 1, #also generic if CBI and ever had 'generic only'
                                 0))) |>
  ungroup()

# add variables that measures the introduction (cbi_intro) and removal (cbi_remov) of CBI narrowly defined and the type of change in a country in a year
# include CBI introduction by new country (first year) but exclude cbi in 1960 because no information on policy in 1959 (so don't know if intro in that year)
dat <- dat |>
  group_by(iso3c) |>
  mutate(cbi_intro = ifelse((cbi_binary != dplyr::lag(cbi_binary) & cbi_binary == 1) | cbi_binary == 1 & year == min(year) & year != 1960, 1, 0)) |>
  mutate(cbi_remov = ifelse(cbi_binary != dplyr::lag(cbi_binary) & cbi_binary == 0 & year != min(year), 1, 0)) |>
  ungroup() |>
  mutate(cbi_change = ifelse(cbi_intro == 1, "introduction",
                                  ifelse(cbi_remov == 1, "removal", "no change")))


# add variables that measures the introduction and removal of generic provision (cbi_gen) and the type of change in a country in a year
# important to account for situation where countries maintain a generic provision when they introduce a CBI policy
dat <- dat |>
  group_by(iso3c) |>
  mutate(cbi_gen_intro = ifelse((cbi_gen != dplyr::lag(cbi_gen) & cbi_gen == 1) | cbi_gen == 1 & year == min(year) & year != 1960, 1, 0)) |>
  mutate(cbi_gen_remov = ifelse(cbi_gen != dplyr::lag(cbi_gen) & cbi_gen == 0 & year != min(year), 1, 0)) |>
  ungroup()  |>
  mutate(cbi_gen_change = ifelse(cbi_gen_intro == 1, "introduction",
                                 ifelse(cbi_gen_remov == 1, "removal", "no change"))) 

summary(dat)
dim(dat) #11002    33


#### adding world bank data
### data up to 2023
### NB note that data availability in WDI changes over time
# running this script may produce slight differences in analysis dataset
wd <- WDI(country = "all", indicator = c("SP.POP.TOTL", "NY.GDP.PCAP.CD"), 
          start = 1960, end = 2023, extra = TRUE)
summary(wd) # 17024 obs of 14 vars
# check missings
plot_missing(wd)
summary(wd$NY.GDP.PCAP.CD)
# Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
# 11.8    572.9   1876.1   8503.7   7836.4 256580.5     2714 
wd %>% summarise(across(everything(), ~ sum(!is.na(.))))
# NY.GDP.PCAP.CD: 14310
# note this includes regional totals/averages

#merge wdi data with cbi dataset
dat1 <- merge(dat, wd, by = c("iso3c", "iso2c", "year"),
              all.x = TRUE)
dat1 %>% summarise(across(everything(), ~ sum(!is.na(.))))
# NY.GDP.PCAP.CD: 10190
summary(dat1)

# Replace Not classified by NA
dat1$income[dat1$income == 'Not classified'] <- NA

## add microstate variable based on population < 1M
dat1 <- dat1 |>
  group_by(iso3c) |>
  mutate(pop_mean = mean(SP.POP.TOTL, na.rm=TRUE)) |>
  mutate(microstate = ifelse(pop_mean < 1000000, 1, 0)) |>
  ungroup()
# check missing microstate
missing_micro <- dat1 |> 
  filter(is.na(microstate))
# still missing: DDR and Taiwan
# imputing values for Taiwan and DDR, both are not microstates
dat1 <- dat1 |>
  mutate(
    microstate = ifelse(country_name == "Taiwan", 0, microstate),
    microstate = ifelse(country_name == "German Democratic Republic", 0, microstate)
  )
# check
dat1 |> 
  filter(is.na(microstate)) 
# no missings


###############################
# Missing values: Income
###############################

# check missings income
missing_income <- dat1 |> 
  filter(is.na(income))
# 222 obs

# missing from WDI: DDR, TWN (Taiwan, missing), VEN (Venezuela, Not classified), Somalia
# Impute manually based on:
# https://ourworldindata.org/grapher/world-banks-income-groups?tab=table
# Taiwan: high income (since 1987)
dat1$income[dat1$iso3c == "TWN" & dat1$year > 1986] <- "High income"
# Venezuela: 'not classified' in WDI download
# Upper-middle-income countries (1987-1993), Lower-middle-income countries (1994-1996), 
# Upper-middle-income countries (1997-2013), High-income countries (2014), Upper-middle-income countries	(2015-2019)
dat1$income[dat1$iso3c == "VEN" & dat1$year > 1986 & dat1$year < 1994] <- "Upper middle income"
dat1$income[dat1$iso3c == "VEN" & dat1$year > 1993 & dat1$year < 1997] <- "Lower middle income"
dat1$income[dat1$iso3c == "VEN" & dat1$year > 1996 & dat1$year < 2014] <- "Upper middle income"
dat1$income[dat1$iso3c == "VEN" & dat1$year == 2014] <- "High income"
dat1$income[dat1$iso3c == "VEN" & dat1$year > 2014 & dat1$year < 2020] <- "Upper middle income" # no dat1a from 2020 onwards!
# Somalia
# Income data not available in WDI when preparing materials for replication (NB GDP data available, not 'income')
dat1$income[dat1$iso3c == "SOM" & dat1$year > 1959] <- "Low income"
# check
missing_income <- dat1 |> 
  filter(is.na(income))
# 88 obs still missing: DDR + early years Taiwan, Venezuela

###############################
# Missing values: GDP
###############################
missing_gdp <- dat1[is.na(dat1$NY.GDP.PCAP.CD), ]
# 812 missing
# mostly data missing before 1980
missing_gdp <- dat1[is.na(dat1$NY.GDP.PCAP.CD), ] |>
  filter(year < 1980) # 534 obs
missing_gdp <- dat1[is.na(dat1$NY.GDP.PCAP.CD), ] |>
  filter(year > 1979) # 278 obs

# Use IMF data
# downloaded data from https://data.imf.org/en/Search-Results#q=gdp%20per%20capita&t=coveob02de888&sort=relevancy
# INDICATOR.ID: NGDPDPC
# Gross domestic product (GDP), Current prices, Per capita, US dollar
# Note: only data from 1980, whereas most missings before 1980
imf_gdp_long <- read_csv("imf_dataset_gdpcapita_2025-07-23.csv")
imf_gdp_long <- imf_gdp_long |>
  rename(NY.GDP.PCAP.CD = OBS_VALUE) |>
  rename(year = TIME_PERIOD) |>
  rename(country_name = COUNTRY) |>
  select(country_name, year, NY.GDP.PCAP.CD)
# 8983 obs of 3 vars
# correct country names for matching
imf_gdp_long <- imf_gdp_long |>
  mutate(
    country_name = ifelse(
      country_name == "Cabo Verde", "Cape Verde",
      ifelse(country_name == "Saint Lucia", "St Lucia", country_name)
    )
  )

dat2 <- dat1 |>
  left_join(imf_gdp_long, by = c("country_name", "year")) |>
  mutate(NY.GDP.PCAP.CD = coalesce(NY.GDP.PCAP.CD.x, NY.GDP.PCAP.CD.y)) |>
  select(-NY.GDP.PCAP.CD.x, -NY.GDP.PCAP.CD.y)
missing_gdp <- dat2[is.na(dat2$NY.GDP.PCAP.CD), ]
# 792 obs
# resolves little

# Creating logged value of GDP
dat2 <- dat2 |>
  mutate(log_gdp = log(NY.GDP.PCAP.CD))
# Normalize the variable using min-max scaling
dat2 <- dat2 |>
  group_by(year) |>
  mutate(gdp_norm = (log_gdp - min(log_gdp, na.rm = T)) / (max(log_gdp, na.rm = T) - min(log_gdp, na.rm = T))) |>
  ungroup()

# add VDEM indicators
# import vdem directly with vdem R package + rename iso3 for country id
load(url("https://github.com/vdeminstitute/vdemdata/raw/master/data/vdem.RData"))
# vdem 27913 obs of 4607 vars
vd <- vdem %>% 
  filter(year > 1959) %>% 
  filter(year < 2024) %>% 
  select(year, country_text_id, v2x_regime, e_v2x_polyarchy_4C, v2x_polyarchy,
         v2svindep, v2x_corr, v2exbribe, v2exembez, v2excrptps, v2exthftps)
dim(vd) # 10729 obs of 11 vars

#merge
dat3 <- merge(dat2, vd, by.x = c("iso3c", "year"), by.y = c("country_text_id", "year"),
              all.x = TRUE)

# download Freedom House
# https://github.com/xmarquez/democracyData
# https://xmarquez.github.io/democracyData/reference/download_fh.html
# remotes::install_github("xmarquez/democracyData")
# library(democracyData)
fh <- download_fh(verbose = FALSE) # 9045 obs of 11 vars

# match country code with iso3c
fh1 <- country_year_coder(
  fh,
  fh_country,
  year,
  code_type = "cown",
  to_system = c("cow"),
  match_type = c("country and code"),
  include_in_output = c("iso3c"),
  verbose = TRUE,
  debug = FALSE,
  match_final_year = FALSE
) # 9095 obs of 12 vars
# difference of 50 obs bc Serbia / Yugoslavia duplicated
# code to filter out duplicates
fh2 <- fh1 %>%
  group_by(cown, year) %>%
  mutate(num_dups = n(),
  dup_id = row_number()) %>%
  ungroup() %>%
  mutate(is_duplicated = dup_id > 1) |>
  filter(is_duplicated == FALSE) |>
  select(-num_dups, -dup_id, -is_duplicated)
# 9045 obs

#merge
dat4 <- merge(dat3, fh2, by.x = c("iso3c", "year"), by.y = c("iso3c", "year"),
              all.x = TRUE)

#create FH variable with labelled categories
dat4$fh_status <- ifelse(dat4$status.y == "NF", "Not Free",
                         ifelse(dat4$status.y == "PF", "Partly Free",
                                ifelse(dat4$status.y == "F", "Free",
                                       NA)))
# remove redundant vars
dat4 <- subset(dat4, select = -c(extended_country_name,iso2c, status.x, status.y, 
                                 fh_country, lastupdated, GWn, cown, in_GW_system) )

# check missing observations on regime variables
na_summary <- dat4 |>
  group_by(year) |>
  summarise(NA_vdem = sum(is.na(v2x_regime)),
            NA_fh = sum(is.na(fh_status)),
            NA_fh_score = sum(is.na(fh_total_reversed)))
# FH available from 1972 with missing in 1981 and 2023
# VDEM available since 1960 but considerable missings (up to 20-21 since 1980s)

# check missing FH after 1971
missing_fh <- dat4[is.na(dat4$fh_total_reversed), ] |>
  filter(year > 1971 & year != 1981 & year != 2023)
# 144 obs 

# Manual inputs for missing countries
# Germany starts in 1990, so before same value will be assigned (always Free and max FH score) - impute only back to 1972 to allign with other data
dat4$fh_total_reversed[dat4$year > 1971 & dat4$year < 1990 & dat4$country_name == "Germany"] <- dat4$fh_total_reversed[dat4$year == 1990 & dat4$country_name == "Germany"]
dat4$fh_status[dat4$year > 1971 & dat4$year < 1990 & dat4$country_name == "Germany"] <- dat4$fh_status[dat4$year == 1990 & dat4$country_name == "Germany"]

# North Macedonia starts in 1991
dat4$fh_total_reversed[dat4$year == 1991 & dat4$country_name == "North Macedonia"] <- dat4$fh_total_reversed[dat4$year == 1992 & dat4$country_name == "North Macedonia"]
dat4$fh_status[dat4$year == 1991 & dat4$country_name == "North Macedonia"] <- dat4$fh_status[dat4$year == 1992 & dat4$country_name == "North Macedonia"]

# Set the values for Czechia based on Freedom House data in website for Czechoslovakia
# https://freedomhouse.org/report/freedom-world#Data
dat4$fh_total_reversed[dat4$country == "Czechia" & dat4$year >= 1971 & dat4$year <= 1974] <- 0
dat4$fh_total_reversed[dat4$country == "Czechia" & dat4$year >= 1975 & dat4$year <= 1988] <- 1
dat4$fh_total_reversed[dat4$country == "Czechia" & dat4$year >= 1989] <- 2
dat4$fh_total_reversed[dat4$country == "Czechia" & dat4$year >= 1990 & dat4$year <= 1992] <- 10
# for FH status
dat4$fh_status[dat4$country == "Czechia" & dat4$year >= 1971 & dat4$year <= 1974] <- "Not Free"
dat4$fh_status[dat4$country == "Czechia" & dat4$year >= 1975 & dat4$year <= 1988] <- "Not Free"
dat4$fh_status[dat4$country == "Czechia" & dat4$year >= 1989] <- "Not Free"
dat4$fh_status[dat4$country == "Czechia" & dat4$year >= 1990 & dat4$year <= 1992] <- "Free"

# Partially MISSING FH scores: Andorra (1977-1992), Micronesia (1986-1990), Liechtenstein (1977-1991), 
# Monaco (1977-1992), Marshall Islands (-1990), San Marino (1977-1992)

# Set the values for Kosovo based on Freedom House data in website
dat4$fh_total_reversed[dat4$country_name == "Kosovo" & dat4$year >= 2008 & dat4$year <= 2013] <- 5
dat4$fh_total_reversed[dat4$country_name == "Kosovo" & dat4$year == 2014] <- 6
dat4$fh_total_reversed[dat4$country_name == "Kosovo" & dat4$year == 2020] <- 6
dat4$fh_total_reversed[dat4$country_name == "Kosovo" & dat4$year >= 2015 & dat4$year <= 2023] <- 7 
#fh_status
dat4$fh_total_reversed[dat4$country_name == "Kosovo" & dat4$year >= 2008 & dat4$year <= 2023] <- "Partly Free"

# Set the values for Liechtenstein based on Freedom House data in website
dat4$fh_total_reversed[dat4$country_name == "Liechtenstein" & dat4$year >= 1971 & dat4$year <= 2015] <- 12
dat4$fh_total_reversed[dat4$country_name == "Liechtenstein" & dat4$year >= 2016 & dat4$year <= 2020] <- 11
dat4$fh_status[dat4$country_name == "Liechtenstein" & dat4$year >= 1971 & dat4$year <= 2023] <- "Free"

# No values for Micronesia before 1991, but after it is always 12 or 11
# Same happens with Marshall Islands, after 1991 it is always 12
# Decision: assign to Micronesia 11 before 1991 and to Marshall Islands 12
# Micronesia adopts a small change in the constitution in 1990, but it was basically the same before
# Marshall Islands had the same constitution since independence
dat4$fh_total_reversed[dat4$country_name == "Marshall Islands" & dat4$year <= 1990] <- 12
dat4$fh_status[dat4$country_name == "Marshall Islands" & dat4$year <= 1990] <- "Free"
dat4$fh_total_reversed[dat4$country_name == "Micronesia, Fed. Sts." & dat4$year <= 1990] <- 11
dat4$fh_status[dat4$country_name == "Micronesia, Fed. Sts." & dat4$year <= 1990] <- "Free"

# check missing FH after 1971
missing_fh <- dat4[is.na(dat4$fh_total_reversed), ] |>
  filter(year > 1971 & year != 1981 & year != 2023)
# 66 obs Andorra (-1993), Micronesia (1986-1990), Monaco (-1993), San Marino (-1992), Yemen (-1990)

# Remaining Problems:
# Andorra changes from 6 in 1976 to 11 in 1993
# Monaco changes from 8 in 1976 to 11 in 1993
# San Marino changes from 10 in 1976 to 12 in 1993
# Yemen was divided between North and South. North was classified some years as Partially free while South was always Not Free
# https://ourworldindata.org/grapher/political-regime-fh?tab=table

# Normalize the variable using min-max scaling
dat4$numeric_fh_total_reversed <- as.numeric(dat4$fh_total_reversed)
dat4 <- dat4 |>
  group_by(year) |>
  mutate(fh_norm = (numeric_fh_total_reversed - min(numeric_fh_total_reversed, na.rm = T)) / (max(numeric_fh_total_reversed, na.rm = T) - min(numeric_fh_total_reversed, na.rm = T))) |>
  ungroup()

### ADDING ISLAND COUNTRIES AND LEGAL TRADITION

load("dat_islands_legal.Rdata")
# see SM1 for sources

dat_islands_legal <- dat_islands_legal |>
  filter(country_name != "Western Samoa")

dat5 <- merge(dat4, dat_islands_legal, by = c("iso3c"), all.x = TRUE)

dat5 <- dat5 |>
  rename(legal_tradition = cat,
         country_name = country_name.x)

# droping variable lending
dat5 = subset(dat5, select = -c(lending) )

# convert island into a categorical variable
dat5$island_country <- ifelse(dat5$island_country == 0, "No",
                                ifelse(dat5$island_country == 1, "Yes", NA))

# reorder income variable
dat5$income <- factor(dat5$income, levels = c("Low income", "Lower middle income", "Upper middle income", "High income"))
dat5 |> count(income)
#               income    n
# 1          Low income 1533
# 2 Lower middle income 2850
# 3 Upper middle income 2847
# 4         High income 3684
# 5                <NA>   88

## ADD SIDS VARIABLE
# NB we end up not using this variable in the analysis
# but keep in for those who are interested
load("dat_sids.Rdata")

dat_sids_2 <- dat_sids_2 |>
  filter(country_name != "Western Samoa")

dat6 <- merge(dat5, dat_sids_2, by = c("iso3c", "country_name"), all.x = TRUE)

# convert island into a categorical variable
dat6$island_country <- ifelse(dat6$sids == 0, "No",
                              ifelse(dat6$sids == 1, "Yes", NA))

# recode common law variable
dat7 <- dat6 |>
  mutate(common_law = case_when(
    legal_tradition == "Common law" ~ 1,
    legal_tradition == "Mixed Common" ~ 1,
    legal_tradition == "Civil law" ~ 0 ,
    legal_tradition == "Mixed" ~ 0 ,
    legal_tradition == "Mixed Muslim" ~ 0 ,
    legal_tradition == "Muslim" ~ 0 ,
    legal_tradition == "Customary" ~ 0 ,
    TRUE ~ NA_real_ # This is for all other values 
  )) |> # late independence variable
  group_by(iso3c) |>
  mutate(
    late_indep = ifelse(min(year) > 1960, 1, 0),
    indep_category = case_when(
      min(year) == 1960 ~ "Indep in 1960",
      min(year) >= 1961 & min(year) <= 1988 ~ "Indep in 1961-88",
      min(year) >= 1989 ~ "Indep from 1989",
      TRUE ~ "Other"  # Add this line if you want to handle other cases
    )) |>
  ungroup()

# imputing the values of common law for some countries
# Serbia, Sudan Germany DR: civil law
# Sudan: mixed (muslim and common)
dat8 <- dat7 |> 
  mutate(
    common_law = ifelse(country_name == "Kosovo", 0, 
                        ifelse(country_name == "German Democratic Republic", 0,
                               ifelse(country_name == "South Sudan", 1, common_law)))
  )

## ADD HAVEN VARIABLE
# or download .csv file from: https://sebastien-laffitte.eu/taxhavens.html
HTHD <- read.csv(url("https://sebastien-laffitte.eu/HTHD/HTHD.csv"))
# 8533 obs of 16 vars
HTHD <- HTHD %>%
  select(iso3, year, haven) %>%
  rename(iso3c = iso3)

dat9 <- merge(dat8, HTHD, by = c("iso3c", "year"), all.x = TRUE)

dat10 <- dat9 %>%
  arrange(iso3c, year) %>%          # Arrange by country and year to ensure correct filling
  group_by(iso3c) %>%               # Group by country
  fill(haven, .direction = "down") %>%  # Fill missing values with the last observed value (assuming this remains stable after 2000)
  ungroup()                         # Ungroup the data

dat10$haven[is.na(dat10$haven)] <- 0


#### LATE INDEPENDENCE

independence_info <- dat10 %>%
  group_by(iso3c) %>%
  summarise(min_year = min(year, na.rm = TRUE)) %>%
  mutate(late_independence_cat = case_when(
    min_year <= 1960 ~ "Independent by 1960",
    min_year > 1960 & min_year <= 1988 ~ "Independent 1961 – 1988",
    min_year > 1988 ~ "Independent post-1988",
    TRUE ~ NA_character_  # handles NAs or other edge cases
  )) %>%
  ungroup()

dat11 <- dat10 %>%
  left_join(independence_info, by = "iso3c") %>%
  select(-min_year)  # Optionally remove the min_year column if not needed

# further data cleaning
# correct country names
library(modeest)
dat11 <- dat11 |>
  group_by(iso3c) |>
  mutate(country = ifelse(is.na(country), mfv(country), country))
# manual correction
dat11$country[dat11$iso3c == "DDR"] <- "German Democratic Republic"
dat11$country[dat11$iso3c == "TWN"] <- "Taiwan"

# selecting variables
dat_full <- dat11 |>
  select(iso3c, country, country_name, year, world_region, 
         cbi_cat, cbi_binary, cbi_gen, 
         cbi_intro, cbi_remov, cbi_change, cbi_gen_intro, cbi_gen_remov, cbi_gen_change,
         SP.POP.TOTL, microstate, island_country,
         NY.GDP.PCAP.CD, log_gdp, income, gdp_norm,
         fh_status, fh_total_reversed, fh_norm, pr, cl, 
         v2x_regime, e_v2x_polyarchy_4C, v2x_polyarchy,
         v2svindep, v2x_corr, v2exbribe, v2exembez, v2excrptps, v2exthftps,
         legal_tradition, common_law, late_indep,
         haven, late_independence_cat)
# 11002 obs of 50 vars

# Export expanded CBI dataset
write.csv(dat_full, file = paste0("01_data/cbi_data_full.csv"), row.names = FALSE)

# Censor the dataset to ensure no country-year observations after CBI introduction
# and reintroduce country-years once CBI is abolished
dat_full_cens <- dat_full %>%
  filter(cbi_binary == 0 | cbi_intro == 1)

# Save censored dataset 
write.csv(dat_full_cens, file = paste0("01_data/cbi_dat_cens.csv"), row.names = FALSE)
# 10546 obs

#####
# IMPUTATION
#####

# impute remaining missings as much as reasonably possible
dat_imp <- dat_full 
# check
plot_missing(dat_imp)

#Income
missing_income <- dat_imp[is.na(dat_imp$income), ] # 88 obs
#impute missing data based on previous or next value by country: income category is time-variant but relatively stable, 
#so imputing down from the first observation and up from the last observations
dat_imp <- dat_imp |>
  group_by(iso3c) |>
  arrange(year) |>
  fill(income, .direction = "updown") |>
  ungroup()
missing_income <- dat_imp[is.na(dat_imp$income), ] # 30 obs

##$GDP: conservative imputation because GDP is more volatile and there are fewer observations
# for some countries first observation in 1990s
missing_gdp <- dat_imp[is.na(dat_imp$gdp_norm), ] # 792 obs
dat_imp <- dat_imp |>
  group_by(iso3c) |>
  arrange(year) |>
  fill(gdp_norm, .direction = "down") |>
  ungroup()
missing_gdp_norm <- dat_imp[is.na(dat_imp$gdp_norm), ] # 750 obs

# FH
missing_fh_status <- dat_imp[is.na(dat_imp$fh_status), ] # 1999 obs
missing_fh_total_reversed <- dat_imp[is.na(dat_imp$fh_total_reversed), ] # 1984 obs
missing_fh_norm <- dat_imp[is.na(dat_imp$fh_norm), ] # 2001 obs
# Conservative imputation: values recorded only when it changes
# fill "down" replacing missing data from top to bottom
# Means that Yemen remains NA until 1990
# Andorra "Partly Free" until 1992
# 2023 imputed based on 2022
dat_imp <- dat_imp |>
  group_by(iso3c) |>
  arrange(year) |>
  fill(fh_status, .direction = "down") |>
  fill(fh_total_reversed, .direction = "down") |>
  fill(fh_norm, .direction = "down") |>
  fill(pr, .direction = "down") |>
  fill(cl, .direction = "down") |>
  ungroup()
missing_fh_status <- dat_imp[is.na(dat_imp$fh_status), ] # 1604 obs
missing_fh_total_reversed <- dat_imp[is.na(dat_imp$fh_total_reversed), ] # 1588 obs
missing_fh_norm <- dat_imp[is.na(dat_imp$fh_norm), ] # 1604 obs

###VDEM
## missings: 10.92%
missing_v2x_regime <- dat_imp[is.na(dat_imp$v2x_regime), ] # 1027 obs
# conservative imputation
dat_imp <- dat_imp |>
  group_by(iso3c) |>
  arrange(year) |>
  fill(v2x_regime, .direction = "down") |>
  fill(e_v2x_polyarchy_4C, .direction = "down") |>
  fill(v2x_polyarchy, .direction = "down") |>
  fill(v2exthftps, .direction = "down") |>
  fill(v2svindep, .direction = "down") |>
  fill(v2x_corr, .direction = "down") |>
  fill(v2exembez, .direction = "down") |>
  fill(v2exbribe, .direction = "down") |>
  fill(v2excrptps, .direction = "down") |>
  fill(pr, .direction = "down") |>
  fill(cl, .direction = "down") |>
  ungroup()
missing_v2x_regime <- dat_imp[is.na(dat_imp$v2x_regime), ] # 1027 obs
# so no changes

# selecting variables
dat_full_imp <- dat_imp |>
  select(iso3c, country, country_name, year, world_region, 
         cbi_cat, cbi_binary, cbi_gen, 
         cbi_intro, cbi_remov, cbi_change, cbi_gen_intro, cbi_gen_remov, cbi_gen_change,
         SP.POP.TOTL, microstate, island_country,
         NY.GDP.PCAP.CD, log_gdp, income, gdp_norm,
         fh_status, fh_total_reversed, fh_norm, pr, cl, 
         v2x_regime, e_v2x_polyarchy_4C, v2x_polyarchy,
         v2svindep, v2x_corr, v2exbribe, v2exembez, v2excrptps, v2exthftps,
         legal_tradition, common_law, late_indep,
         haven, late_independence_cat)
# 11002 obs of 50 vars

# Censor the dataset to ensure no country-year observations after CBI introduction
# and reintroduce country-years once CBI is abolished
dat_imp_cens <- dat_full_imp %>%
  filter(cbi_binary == 0 | cbi_intro == 1)
# 10546 obs

# Save censored dataset 
write.csv(dat_imp_cens, file = paste0("01_data/cbi_dat_imp_cens.csv"), row.names = FALSE)

# Do the same for introduction of generic policies
# Censor the dataset to ensure no country-year observations after CBI introduction
# and reintroduce country-years once CBI is abolished
dat_imp_generic_cens <- dat_full_imp %>%
  filter(cbi_gen == 0 | cbi_gen_intro == 1)
# 9591 obs

# Save censored dataset 
write.csv(dat_imp_generic_cens, file = paste0("01_data/cbi_dat_imp_generic_cens.csv"), row.names = FALSE)

### END



